In [60]:
import numpy as np
import pandas as pd
import mysql.connector
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
In [2]:
try:
    mydb = mysql.connector.connect(
        host = 'localhost',
        port = '3306',
        user = 'root',
        password = 'root'
    )
    songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)
except mysql.connector.Error as err:
    # Catch the error and load data from CSV file instead
    print("Error: {}".format(err))

    # Load data from CSV file into a Pandas dataframe
    songs = pd.read_csv('song_data.csv')
C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\1171909835.py:8: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  songs = pd.read_sql('SELECT year, country, style, language, final_place, final_total_points FROM eurovision.song_data', con=mydb)
In [3]:
songs
Out[3]:
year country style language final_place final_total_points
0 2022 Albania Pop Albanian, English NaN NaN
1 2022 Latvia Pop English NaN NaN
2 2022 Lithuania Ballad Lithuanian 14.0 128.0
3 2022 Switzerland Ballad English 17.0 78.0
4 2022 Slovenia Pop Slovene NaN NaN
... ... ... ... ... ... ...
523 2009 France Ballad French 8.0 218.0
524 2009 Russia Ballad Russian, Ukrainian 11.0 185.0
525 2009 Germany Pop English 20.0 91.0
526 2009 United Kingdom Ballad English 5.0 328.0
527 2009 Spain Dance Spanish, English 24.0 47.0

528 rows × 6 columns

Hipotezės¶

  1. Tam tikri muzikos žanrai dažniau patenka į finalą (laimetojų sąraše arba apskritai)
  2. Populiarausias žanras surenka vidutiniškai daugiau tašku nei kiti
  3. Ar daugiau tašku surinka anglu kalbų dainos ar ne?

Duomenų valymas¶

In [42]:
songs = songs.dropna(subset=['final_place'])
songs = songs.reset_index(drop=True)
songs = songs.replace("English6", "English")
songs = songs.replace("Greek1", "Greek")
songs = songs.replace("Italian[f]", "Italian")
songs
Out[42]:
year country style language final_place final_total_points Mean Value
0 2022 Lithuania Ballad Lithuanian 14.0 128.0 NaN
1 2022 Switzerland Ballad English 17.0 78.0 NaN
2 2022 Ukraine Traditional Ukrainian 1.0 631.0 NaN
3 2022 Netherlands Ballad Dutch 11.0 171.0 NaN
4 2022 Moldova Traditional Romanian, English 7.0 253.0 NaN
... ... ... ... ... ... ... ...
327 2009 France Ballad French 8.0 218.0 NaN
328 2009 Russia Ballad Russian, Ukrainian 11.0 185.0 NaN
329 2009 Germany Pop English 20.0 91.0 NaN
330 2009 United Kingdom Ballad English 5.0 328.0 NaN
331 2009 Spain Dance Spanish, English 24.0 47.0 NaN

332 rows × 7 columns

Laimėtojų sąrašas¶

In [5]:
winners = songs.loc[songs['final_place'] == 1]
winners
Out[5]:
year country style language final_place final_total_points
2 2022 Ukraine Traditional Ukrainian 1.0 631.0
48 2021 Italy Rock Italian 1.0 524.0
66 2019 Netherlands Ballad English 1.0 498.0
78 2018 Israel Pop English 1.0 529.0
105 2017 Portugal Ballad Portuguese 1.0 758.0
144 2016 Ukraine Pop English, Crimean Tatar 1.0 534.0
169 2015 Sweden Pop English 1.0 625.0
192 2014 Austria Ballad English 1.0 520.0
206 2013 Denmark Pop English 1.0 0.0
245 2012 Sweden Dance English 1.0 639.0
265 2011 Azerbaijan Ballad English 1.0 405.0
306 2010 Germany Pop English 1.0 430.0
318 2009 Norway Traditional English 1.0 690.0

Populiariausi žanrai¶

In [6]:
styleCounts = songs['style'].value_counts()
styleCounts
Out[6]:
style
Pop            150
Ballad          91
Dance           38
Traditional     26
Rock            24
Opera            3
Name: count, dtype: int64
In [7]:
groupedSongs = songs.groupby(['year', 'style']).size().reset_index(name='count')
groupedSongs
Out[7]:
year style count
0 2009 Ballad 9
1 2009 Dance 5
2 2009 Pop 5
3 2009 Rock 1
4 2009 Traditional 5
... ... ... ...
57 2022 Ballad 12
58 2022 Dance 1
59 2022 Pop 6
60 2022 Rock 1
61 2022 Traditional 5

62 rows × 3 columns

In [47]:
# create a scatter plot
plt.scatter(x=groupedSongs['year'], y=groupedSongs['style'], s=groupedSongs['count']*20)

# set axis labels and title
plt.xlabel('Year')
plt.ylabel('Style')
plt.title('Occurrence of Style by Year')

# show the plot
plt.show()
In [9]:
groupedWinners = winners.groupby(['style']).size().reset_index(name='count')
groupedWinners
Out[9]:
style count
0 Ballad 4
1 Dance 1
2 Pop 5
3 Rock 1
4 Traditional 2

Pirmos Hipotezės išvados¶

Pop žanras yra ko ne pastoviai populiariausias žanras kiekvienais metais pagal dainų pasiekusiu finalą kiekį. Taip pat pop dainos yra laimėjusios daugiausiai eurovizijų.

Žanrų vidutiniai taškai¶

In [48]:
# group by year and style and compute mean
mean_by_year_style = songs.groupby(['year', 'style'])['final_total_points'].mean()

# create new DataFrame with mean values
meanSongs = pd.DataFrame(mean_by_year_style).reset_index()
meanSongs.columns = ['year', 'style', 'mean_by_year_style']

meanSongs
Out[48]:
year style mean_by_year_style
0 2009 Ballad 213.111111
1 2009 Dance 162.000000
2 2009 Pop 151.600000
3 2009 Rock 160.000000
4 2009 Traditional 245.200000
... ... ... ...
57 2022 Ballad 162.833333
58 2022 Dance 38.000000
59 2022 Pop 229.500000
60 2022 Rock 38.000000
61 2022 Traditional 246.600000

62 rows × 3 columns

In [49]:
# plot mean values by year and style
fig, ax = plt.subplots(figsize=(8, 6))
for style, data in meanSongs.groupby('style'):
    ax.plot(data['year'], data['mean_by_year_style'], label=style)
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Mean Value')
ax.set_title('Mean Value by Year and Style')
plt.show()

Antros Hipotezės išvados¶

Dėl didėlio pop dainų kiekio vidurkis yra mažesnis negu kitų dainų vidurkiai.

Taškai pagal dainos kalbą¶

In [51]:
languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongs.to_string())
year  language               
2017  Portuguese                 758.000000
2022  Ukrainian                  631.000000
2016  English, Crimean Tatar     534.000000
2015  Italian                    527.000000
2021  Italian                    524.000000
2019  Italian                    472.000000
2021  French                     465.500000
2022  Spanish, English           459.000000
2012  Udmurt, English            426.000000
      Serbian                    384.000000
2021  Ukrainian                  364.000000
2011  Italian, English           350.000000
2017  Italian                    334.000000
2022  Serbian, Latin             312.000000
2016  English, Bulgarian         307.000000
2018  Italian                    276.500000
2022  Italian                    268.000000
2012  Albanian                   263.000000
2010  Greek                      262.000000
2011  English, Greek             260.000000
2016  French, English            257.000000
2022  Romanian, English          253.000000
2009  Estonian                   253.000000
      English                    239.615385
2019  Icelandic                  232.000000
2012  Estonian                   230.000000
2018  English                    218.117647
2009  French                     218.000000
      Bosnian                    214.000000
2012  English, Italian           213.000000
2022  English, Portuguese        207.000000
2021  Russian, English           204.000000
2011  Serbian                    200.000000
2017  Hungarian                  200.000000
2012  Spanish                    199.000000
2019  English                    193.833333
2011  English                    187.388889
2014  Polish, English            185.000000
2010  French                     185.000000
2009  Russian, Ukrainian         185.000000
2018  Albanian                   184.000000
2009  English, Armenian          182.000000
2010  English                    179.052632
2018  French                     173.000000
2022  Dutch                      171.000000
2012  Spanish, English           170.000000
2017  English                    169.947368
2016  English                    166.619048
2011  Corsican                   166.000000
2012  English                    164.125000
2011  English, Russian           163.000000
2014  English                    161.250000
2010  Hebrew                     161.000000
2015  English                    160.190476
2009  Romanian, English          159.000000
2016  French                     151.000000
2022  English                    150.642857
2010  Spanish                    149.000000
      Serbian                    147.000000
2017  French, English            135.000000
2022  Lithuanian                 128.000000
2017  English, Italian           128.000000
2021  English                    126.466667
2016  Italian, English           124.000000
2011  English, Hungarian         124.000000
2009  English, Hebrew, Arabic    122.000000
2019  English, French            120.000000
2009  Croatian                   113.000000
2018  Serbian                    113.000000
2014  English, Spanish           112.000000
2011  Spanish                    111.000000
2009  Portuguese                 109.000000
2019  Slovene                    105.000000
      French, English            105.000000
2021  Serbian                    102.000000
2010  Portuguese                  93.000000
2018  Hungarian                   93.000000
2019  Albanian                    90.000000
2015  Romanian, English           90.000000
2019  Serbian                     89.000000
2009  French, English             86.000000
2012  French, English             85.000000
2017  Belarusian                  83.000000
2015  Montenegrin                 78.000000
2014  Montenegrin                 75.000000
2009  English, Russian            69.000000
2022  English, Spanish            65.000000
2018  Slovene                     64.000000
      Spanish                     61.000000
2014  Italian                     59.000000
2021  Albanian                    57.000000
2019  Spanish                     54.000000
2009  Spanish, English            47.000000
2018  Portuguese                  39.000000
2014  English, Slovene            36.000000
2015  Spanish                     32.000000
      French                      27.000000
2021  English, Sranan Tongo       23.000000
2022  Icelandic                   20.000000
      Breton                      17.000000
2021  Spanish                      6.000000
2014  French                       6.000000
2017  Spanish, English             5.000000
2013  Romanian                     0.000000
      Italian                      0.000000
      Icelandic                    0.000000
      Hungarian                    0.000000
      Spanish                      0.000000
      French                       0.000000
      Estonian                     0.000000
      English                      0.000000
2012  Macedonian                   0.000000
      Bosnian                      0.000000
2013  Greek                        0.000000
In [43]:
languageSongsbyStyle = songs.groupby(['language'])['final_total_points'].mean().sort_values(ascending=False)
print(languageSongsbyStyle.to_string())
language
English, Crimean Tatar     534.000000
Ukrainian                  497.500000
Udmurt, English            426.000000
Serbian, Latin             312.000000
English, Bulgarian         307.000000
Italian                    304.111111
English, Greek             260.000000
Portuguese                 249.750000
Italian, English           237.000000
English, Portuguese        207.000000
Russian, English           204.000000
French                     187.888889
Russian, Ukrainian         185.000000
Polish, English            185.000000
English, Armenian          182.000000
Serbian                    172.500000
Dutch                      171.000000
English, Italian           170.500000
Spanish, English           170.250000
Romanian, English          167.333333
Corsican                   166.000000
English                    161.729258
Hebrew                     161.000000
Estonian                   161.000000
Albanian                   148.500000
French, English            133.600000
Greek                      131.000000
Lithuanian                 128.000000
English, Hungarian         124.000000
English, Hebrew, Arabic    122.000000
English, French            120.000000
English, Russian           116.000000
Croatian                   113.000000
Bosnian                    107.000000
Hungarian                   97.666667
English, Spanish            88.500000
Slovene                     84.500000
Icelandic                   84.000000
Belarusian                  83.000000
Montenegrin                 76.500000
Spanish                     76.500000
English, Slovene            36.000000
English, Sranan Tongo       23.000000
Breton                      17.000000
Macedonian                   0.000000
Romanian                     0.000000
In [44]:
languageSongsbyStyle.describe()
Out[44]:
count     46.000000
mean     167.159332
std      113.433223
min        0.000000
25%      100.000000
50%      161.000000
75%      187.166667
max      534.000000
Name: final_total_points, dtype: float64
In [65]:
songs.value_counts()
Out[65]:
Series([], Name: count, dtype: int64)
In [46]:
languageSongs = songs.groupby(['year', 'language'])['final_total_points'].mean().unstack()
languageSongs.plot(kind='bar', stacked=True)

plt.xlabel('Year')
plt.ylabel('Mean final total points')
plt.title('Mean total points by year and language')
plt.legend(title='Language', bbox_to_anchor=(1.05, 1))

plt.show()

Trečios hipotezės išvados¶

Gryna anglų kalbą yra tarp šiek tiek virš 50 procentilio. Tačiau jeigu sujungus ukrainiečių kalbas, junginys būtų pats auksčiaučias tašku vidurkis tarp duomenų rinkinio.

Mašininis modelis¶

In [64]:
new_data = pd.DataFrame({'year': [2023], 'style': ['Rock']})


# Prepare the data
X = songs[['year', 'style']] # Features
y = songs['final_place'] # Target

# Create a Linear Regression model object
model = LinearRegression()

# Encode the categorical variable 
le = LabelEncoder()
X['style'] = le.fit_transform(X['style'])

# Train the model
model.fit(X, y)

# Print the coefficients
print(model.coef_)

# Make predictions on new data
new_data['style'] = le.transform(new_data['style'])
prediction = model.predict(new_data)
print(prediction)
[0.00253852 0.14683223]
[13.67136886]
C:\Users\Rytis\AppData\Local\Temp\ipykernel_19420\3447968430.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['style'] = le.fit_transform(X['style'])